In this (non-preregistered) analysis, we look at variables collected at years 12 & 16 which have low levels of missingness, and test the effect of selecting only those who took part at year 21 and 26 time points (“attritioning”).
This analysis examines the impact of multiple imputation to correct for attrition bias at three different time points.
We compare ( original ) vs ( attritioned + imputed ) results across 200 multiply imputed datasets.
Load Data
Code
source("0_load_data.R")df=df%>%filter(!(randomfamid%in%exclude_fams_rq1x))%>%filter(!(randomfamid%in%exclude_fams_onesib))%>%filter(!(randomfamid%in%exclude_fams_rq6y))## range of participation outcomes to use for attritionrange_participation_outcomes=6:8# Subset vectors to only include participation outcomes in rangerq1y_twin=rq1y_twin[range_participation_outcomes]rq1y_twin1=rq1y_twin1[range_participation_outcomes]rq1y_twin2=rq1y_twin2[range_participation_outcomes]rq1y_twin_labels=rq1y_twin_labels[range_participation_outcomes]rq1y_twin_labels_clean=rq1y_twin_labels_clean[range_participation_outcomes]rq1y_twin_labels_clean_extrashort=rq1y_twin_labels_clean_extrashort[range_participation_outcomes]boot_compare_results=readRDS(file.path("results","8_rq6_boot_compare_results.Rds"))# Number of imputations per timepointn_imputations=length(boot_compare_results)/length(range_participation_outcomes)
cbind(rq1y_twin1,rq1y_twin_labels_clean)%>%knitr::kable(caption ="Attrition time points")
Attrition time points
rq1y_twin1
rq1y_twin_labels_clean
u1cdata1
Y21 (TEDS21 phase-1 questionnaire)
zmhdata1
Y26 (TEDS26 questionnaire)
zcdata1
Y26 (CATSLife web tests)
Table of all variables used in multiple imputation
Code
# Function to map variable prefix to study waveget_study_wave=function(var_name){prefix=substr(var_name, 1, 1)wave_map=c("a"="1st Contact","b"="2 Year","c"="3 Year","d"="4 Year","e"="In Home","g"="7 Year","h"="8 Year","i"="9 Year","j"="10 Year","l"="12 Year","n"="14 Year","p"="16 Year","r"="18 Year","u"="21 Year","z"="26 Year")return(wave_map[prefix])}rq6_imputation_vars=c(rq6y_all, rq6z_vars)impute_vars_labels=var_to_label(rq6_imputation_vars)%>%sapply(., function(x)ifelse(is.null(x[1]), "", x[1]))# Create formatted table for imputation variablesv_impute=data.frame( Description =impute_vars_labels, `Teds Code` =ifelse(rq6_imputation_vars%in%original_colnames, rq6_imputation_vars, paste0(rq6_imputation_vars, "*")), `Range or Level` =sapply(rq6_imputation_vars, function(var){if(var%in%colnames(df)){if(class(df[[var]])=="numeric"){paste0(round(min(df[[var]], na.rm =TRUE), 2), " - ", round(max(df[[var]], na.rm =TRUE), 2))}elseif(is.factor(df[[var]])){factor_levels=levels(df[[var]])paste(c(paste0(factor_levels[1], "*"), factor_levels[-1]), collapse =", ")}else{paste(unique(df[[var]]), collapse =", ")}}else{"Variable not found"}}), N =sapply(rq6_imputation_vars, function(var){if(var%in%colnames(df)){sum(!is.na(df[[var]]))}else{0}}), `Study.Wave` =sapply(rq6_imputation_vars, get_study_wave))gt(v_impute)%>%cols_label_with(fn =~gsub("\\.", " ", .x))%>%tab_style( style =cell_text(size =px(12)), locations =cells_body())%>%tab_style( style =cell_text(size =px(12)), locations =cells_column_labels())%>%cols_hide(N)%>%tab_source_note( source_note ="Note: Range or Level shows min-max values for numeric variables or factor levels for categorical variables (reference level marked with *). Variable codes with an asterisk (*) have been derived or modified from the original dataset.")%>%tab_options( table.width ="70%")%>%cols_width(Description~px(300),everything()~px(120))
Description
Teds Code
Range or Level
Study Wave
MFQ scale from 11 MFQ items (child self-report) at 12, 0-22
lcmfqt1
0 - 22
12 Year
SDQ Externalising scale at 12
lsdqext1*
0 - 20
12 Year
G composite scale from child web tests at 12, standardised
lcg1
-3.67 - 3.04
12 Year
Core subjects (English, maths, science): mean grade in GCSE results (twin exams at 16), 4-11
pcexgcsecoregrdm1
4 - 11
16 Year
MFQ scale from 11 MFQ items (child self-report) at 12, 0-22
lcmfqt2
0 - 22
12 Year
lsdqext2*
0 - 20
12 Year
G composite scale from child web tests at 12, standardised
lcg2
-3.67 - 3.04
12 Year
Core subjects (English, maths, science): mean grade in GCSE results (twin exams at 16), 4-11
pcexgcsecoregrdm2
4 - 11
16 Year
School cohort, see value labels
cohort
Cohort 1: twins born Jan-94 to Aug-94*, Cohort 2: twins born Sep-94 to Aug-95, Cohort 3: twins born Sep-95 to Aug-96, Cohort 4: twins born Sep-96 to Dec-96
3 Year
Age in years of natural mother at time of birth of twins
amumagetw
16 - 45
1st Contact
Single Parent
asingle*
cohabiting biological mother and father / cohabiting biological parent with other*, single parent
1st Contact
Mother SOC employment level (1st Contact), 1-9
amosoc2*
caring for children at home*, 1, 2, 3, 4, 5, 6, 7, 8, 9, no job
1st Contact
Maternal Education (formatted as numeric)
amohqualn*
1 - 8
1st Contact
Ethnic origin of twins, simplified coding (1st Contact), 1=white 0=other
aethnic
0 - 1
1st Contact
Main language spoken at home (1st Contact), see value labels
alang
other*, English, English + other
1st Contact
Number of older siblings (formatted as numeric variable)
anoldsibn*
0 - 5
1st Contact
Number of younger siblings (formatted as numeric variable)
anyngsibn*
0 - 2
1st Contact
Member of a Twins Club (1st Contact), 1Y 0N
atwclub
0*, 1
1st Contact
Census data 2001 (code KS001) linked to 1998 postcode: population density, N per hectare
cens01pop98density
0 - 310
3 Year
Smoked cigarettes while pregnant (1st Contact), 1Y 0N
Child G composite (7 year twin phone), standardised
gcg1
-4.36 - 4.84
7 Year
Child G composite (7 year twin phone), standardised
gcg2
-4.36 - 4.84
7 Year
2-subject (maths, English) mean NC level (7 year teacher), 0-4
gt2ac1
0 - 4
7 Year
2-subject (maths, English) mean NC level (7 year teacher), 0-4
gt2ac2
0 - 4
7 Year
SDQ hyperactivity total (7 year parent), 0-10
gpsdqhypt1
0 - 10
7 Year
SDQ hyperactivity total (7 year parent), 0-10
gpsdqhypt2
0 - 10
7 Year
SDQ conduct total (7 year parent), 0-10
gpsdqcont1
0 - 10
7 Year
SDQ conduct total (7 year parent), 0-10
gpsdqcont2
0 - 10
7 Year
SDQ emotion total (7 year parent), 0-10
gpsdqemot1
0 - 10
7 Year
SDQ emotion total (7 year parent), 0-10
gpsdqemot2
0 - 10
7 Year
ARBQ overall anxiety total (7 year parent), 0-52
gpanxt1
0 - 46
7 Year
ARBQ overall anxiety total (7 year parent), 0-52
gpanxt2
0 - 46
7 Year
Conners ADHD overall Total at 8 (0-54)
hconnt1
0 - 54
8 Year
Conners ADHD overall Total at 8 (0-54)
hconnt2
0 - 54
8 Year
G composite (9 year child), standardised
icg1
-3.63 - 2.24
9 Year
G composite (9 year child), standardised
icg2
-3.63 - 2.24
9 Year
3-subject (English, maths, science) mean NC level (9 year teacher), 1-5
it3ac1
1 - 5
9 Year
SDQ Conduct scale (child self-report) at 9, 0-10
icsdqcont1
0 - 10
9 Year
SDQ Conduct scale (child self-report) at 9, 0-10
icsdqcont2
0 - 10
9 Year
SDQ Emotion scale (child self-report) at 9, 0-10
icsdqemot1
0 - 10
9 Year
SDQ Emotion scale (child self-report) at 9, 0-10
icsdqemot2
0 - 10
9 Year
SDQ Hyperactivity scale (child self-report) at 9, 0-10
icsdqhypt1
0 - 10
9 Year
SDQ Hyperactivity scale (child self-report) at 9, 0-10
icsdqhypt2
0 - 10
9 Year
SDQ Hyperactivity scale (parent) at 9, 0-10
ipsdqhypt1
0 - 10
9 Year
SDQ Hyperactivity scale (parent) at 9, 0-10
ipsdqhypt2
0 - 10
9 Year
SDQ Conduct scale (parent) at 9, 0-10
ipsdqcont1
0 - 10
9 Year
SDQ Conduct scale (parent) at 9, 0-10
ipsdqcont2
0 - 10
9 Year
SDQ Emotion scale (parent) at 9, 0-10
ipsdqemot1
0 - 10
9 Year
SDQ Emotion scale (parent) at 9, 0-10
ipsdqemot2
0 - 10
9 Year
Negative parental feelings scale, child-self-rated at 9, 0-8
icparnegt1
0 - 8
9 Year
Negative parental feelings scale, child-self-rated at 9, 0-8
icparnegt2
0 - 8
9 Year
ARBQ overall anxiety total (9 year parent), 0-50
ipanxt1
0 - 38
9 Year
ARBQ overall anxiety total (9 year parent), 0-50
ipanxt2
0 - 38
9 Year
Parent Chaos scale at 9, 0-12
ipchatot
0 - 12
9 Year
Child G composite (10 year twin web), standardised
jcg1
-3.67 - 2.98
10 Year
Child G composite (10 year twin web), standardised
jcg2
-3.67 - 2.98
10 Year
3-subject (English, maths, science) mean NC level (10 year teacher), 1-5
jt3ac1
1 - 5
10 Year
3-subject (English, maths, science) mean NC level (10 year teacher), 1-5
jt3ac2
1 - 5
10 Year
G composite scale from child web tests at 14, standardised
ncg1
-4.12 - 3.17
14 Year
G composite scale from child web tests at 14, standardised
ncg2
-4.12 - 3.17
14 Year
End of KS3 3-subject Academic achievement mean level (from parent SLQ), 1-9
npks3t3a1
1 - 9
14 Year
End of KS3 3-subject Academic achievement mean level (from parent SLQ), 1-9
npks3t3a2
1 - 9
14 Year
Conners Hyperactivity-Impulsivity scale at 14 (child), 0-27
ncconhit1
0 - 27
14 Year
Conners Hyperactivity-Impulsivity scale at 14 (child), 0-27
ncconhit2
0 - 27
14 Year
Conners total scale at 14 (parent), 0-54
npconnt1
0 - 54
14 Year
Conners total scale at 14 (parent), 0-54
npconnt2
0 - 54
14 Year
Negative Parental Feelings scale at 14 (child), 0-8
ncparnegt1
0 - 8
14 Year
Negative Parental Feelings scale at 14 (child), 0-8
ncparnegt2
0 - 8
14 Year
Chaos at home total scale at 14 (child), 0-12
ncchato1
0 - 11
14 Year
Chaos at home total scale at 14 (child), 0-12
ncchato2
0 - 11
14 Year
G composite scale from child web tests at 16, standardised
pcg1
-2.86 - 4.06
16 Year
G composite scale from child web tests at 16, standardised
pcg2
-2.86 - 4.06
16 Year
MFQ total scale (child behaviour qnr at 16), 0-26
pcbhmfqt1
0 - 26
16 Year
MFQ total scale (child behaviour qnr at 16), 0-26
pcbhmfqt2
0 - 26
16 Year
SDQ Emotion scale (child behaviour qnr at 16), 0-10
pcbhsdqemot1
0 - 10
16 Year
SDQ Emotion scale (child behaviour qnr at 16), 0-10
pcbhsdqemot2
0 - 10
16 Year
SDQ Conduct scale (child behaviour qnr at 16), 0-10
pcbhsdqcont1
0 - 10
16 Year
SDQ Conduct scale (child behaviour qnr at 16), 0-10
pcbhsdqcont2
0 - 10
16 Year
SDQ Hyperactivity scale (child behaviour qnr at 16), 0-10
pcbhsdqhypt1
0 - 10
16 Year
SDQ Hyperactivity scale (child behaviour qnr at 16), 0-10
pcbhsdqhypt2
0 - 10
16 Year
SDQ Conduct scale (parent behaviour qnr at 16), 0-10
ppbhsdqcont1
0 - 10
16 Year
SDQ Conduct scale (parent behaviour qnr at 16), 0-10
ppbhsdqcont2
0 - 10
16 Year
SDQ Hyperactivity scale (parent behaviour qnr at 16), 0-10
ppbhsdqhypt1
0 - 10
16 Year
SDQ Hyperactivity scale (parent behaviour qnr at 16), 0-10
ppbhsdqhypt2
0 - 10
16 Year
Chaos total score (child web at 16), 0-12
pcchatot1
0 - 12
16 Year
Chaos total score (child web at 16), 0-12
pcchatot2
0 - 12
16 Year
UCAS point score total estimate from known UK exam results (twin qnr at 18)
rcqucast1
20 - 910
18 Year
UCAS point score total estimate from known UK exam results (twin qnr at 18)
rcqucast2
20 - 910
18 Year
Twin probable highest level of qualification including current study (TEDS21 phase 1 twin qnr), 1-11 see value labels
u1chqualp1
1 - 11
21 Year
Twin probable highest level of qualification including current study (TEDS21 phase 1 twin qnr), 1-11 see value labels
u1chqualp2
1 - 11
21 Year
SDQ Conduct total score (TEDS21 phase 1 twin qnr), 0-10
u1csdqcont1
0 - 9
21 Year
SDQ Conduct total score (TEDS21 phase 1 twin qnr), 0-10
u1csdqcont2
0 - 9
21 Year
SDQ Hyperactivity total score (TEDS21 phase 1 twin qnr), 0-10
u1csdqhypt1
0 - 10
21 Year
SDQ Hyperactivity total score (TEDS21 phase 1 twin qnr), 0-10
u1csdqhypt2
0 - 10
21 Year
Demographics item: highest qualification ordinal level (TEDS26 twin MHQ), see value labels
zmhhqual1
1 - 11
26 Year
Demographics item: highest qualification ordinal level (TEDS26 twin MHQ), see value labels
zmhhqual2
1 - 11
26 Year
SDQ Conduct total score (TEDS26 twin MHQ), 0-10
zmhsdqcont1
0 - 9
26 Year
SDQ Conduct total score (TEDS26 twin MHQ), 0-10
zmhsdqcont2
0 - 9
26 Year
SDQ Hyperactivity total score (TEDS26 twin MHQ), 0-10
zmhsdqhypt1
0 - 10
26 Year
SDQ Hyperactivity total score (TEDS26 twin MHQ), 0-10
zmhsdqhypt2
0 - 10
26 Year
Note: Range or Level shows min-max values for numeric variables or factor levels for categorical variables (reference level marked with *). Variable codes with an asterisk (*) have been derived or modified from the original dataset.
Data Processing
Transform Raw Results to Dataframe
Code
# Create pairwise correlation variable names (if needed for cor_resid later)test_correlation_matrix=matrix( nrow =length(rq6y), ncol =length(rq6y))for(iinseq_along(rq6y)){for(jinseq_along(rq6y)){test_correlation_matrix[i,j]=paste(rq6y[i], rq6y[j], collapse =" ")}}x=test_correlation_matrix[lower.tri(test_correlation_matrix, diag =FALSE)]x_var=str_extract(x, "^\\S+")y_var=str_extract(x, "\\S+$")rm(test_correlation_matrix,x)# Split results by timepointtimepoint_names=rq1y_twinimputed_comparisons_df=list()for(tpinseq_along(timepoint_names)){timepoint_name=timepoint_names[tp]# Get all imputations for this timepointtp_pattern=paste0("^", timepoint_name)tp_indices=grep(tp_pattern, names(boot_compare_results))# Extract results for each imputationtp_results=boot_compare_results[tp_indices]# For each metric (md, smd, var), aggregate across imputations# Using Rubin's rules for multiple imputationmetrics=c("md", "smd", "var")metric_results=list()for(metric_idxinseq_along(metrics)){metric=metrics[metric_idx]metric_data=lapply(tp_results, function(imp){# imp[[metric]] is a list of bootstrap iterations# Convert to matrix where rows = bootstrap iterations, cols = variablessapply(imp[[metric]], function(boot_iter)boot_iter)})all_boots=do.call(cbind, metric_data)# Calculate summary statistics across all bootstrap samplesmetric_summary=apply(all_boots, 1, function(xx).mean_qi_pd(xx))metric_summary_df=do.call(rbind, metric_summary)metric_summary_df=data.frame(metric_summary_df)metric_summary_df$dataset=rq1y_twin_labels_clean_extrashort[tp]metric_summary_df$stat=metricmetric_summary_df$variable=rq6y_labelsrownames(metric_summary_df)=NULLmetric_results[[metric_idx]]=metric_summary_df}# Combine metrics for this timepointimputed_comparisons_df[[tp]]=do.call(rbind.data.frame, metric_results)}# Combine all timepoints into single dataframeimputed_comparisons_df0=do.call(rbind.data.frame, imputed_comparisons_df)rownames(imputed_comparisons_df0)=NULL# Save for future usesaveRDS(imputed_comparisons_df0, file.path("results", "8_imputed_comparisons_df.Rds"))
Create Table
Code
# Process the dataframe# Note: Variance values are already percentage change from compare_var functionimputed_comparisons_df=imputed_comparisons_df0%>%mutate( variable =rq6y_labels[match(.$variable, rq6y_labels)])%>%group_by(dataset, stat)%>%mutate( pval_adj =stats::p.adjust(pval, method ="holm"))
Note. Est = Estimate, LB = Lower Bound 95% Confidence Interval, UB = Upper Bound 95% Confidence Interval. Significant (pBonferroni-Holm) effects are highlighted in green (increases) or red (decreases).
Note. Est = Estimate, LB = Lower Bound 95% Confidence Interval, UB = Upper Bound 95% Confidence Interval. Significant (pBonferroni-Holm) effects are highlighted in green (increases) or red (decreases).
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 5574031 297.7 10147732 542 8046225 429.8
Vcells 68501730 522.7 131988352 1007 124163771 947.3
Code
Sys.setenv(MAGICK_MEMORY_LIMIT ="8GiB")Sys.setenv(MAGICK_MAP_LIMIT ="8GiB")Sys.setenv(MAGICK_DISK_LIMIT ="16GiB")# Read all correlation plot imagesimg1=image_read("plots/pngs/6_rq6_correlations.png")img2=image_read("plots/pngs/7_rq7_correlations.png")img3=image_read("plots/pngs/8_rq6_correlations.png")# Add vertical spacing (white border) between imagesimg1=image_border(img1, "white", "0x20")# Add 20px white space at bottomimg2=image_border(img2, "white", "0x20")# Add 20px white space at bottom# Stack all plots verticallycombined=image_append(c(img1, img2, img3), stack =TRUE)# Save combined imageimage_write(combined, "plots/pngs/8_rq6_correlations_combined.png")knitr::include_graphics("plots/pngs/8_rq6_correlations_combined.png")
ACE Comparison
Code
# Extract ACE estimates and differences from all datasetsace_estimates_list=list()ace_differences_list=list()for(tpinseq_along(timepoint_names)){timepoint_name=timepoint_names[tp]# Get all imputations for this timepointtp_pattern=paste0("^", timepoint_name)tp_indices=grep(tp_pattern, names(boot_compare_results))# Extract results for each imputationtp_results=boot_compare_results[tp_indices]# Process ACE results across all imputationsace_results=lapply(tp_results, function(imp){lapply(imp$ace, function(x)x)})# Flatten and processace_results=unlist(ace_results, recursive =FALSE)ace_results=lapply(ace_results, function(x)do.call(rbind, x))ace_results=do.call(rbind, ace_results)%>%filter(par%in%c("a","c", "e"))ace_results_summary=ace_results%>%group_by(par, name, sex, group)%>%summarise(.mean_qi_pd(value), .groups ="drop")# Separate estimates and differencesace_estimates_list[[tp]]=ace_results_summary%>%filter(group!="diff")ace_differences_list[[tp]]=ace_results_summary%>%filter(group=="diff")# Add dataset identifierace_estimates_list[[tp]]$dataset=rq1y_twin1[tp]ace_differences_list[[tp]]$dataset=rq1y_twin1[tp]}
Warning: There was 1 warning in `summarise()`.
ℹ In argument: `.mean_qi_pd(value)`.
ℹ In group 39: `par = "c"`, `name = "lsdqext"`, `sex = "female"`, `group =
"diff"`.
Caused by warning:
! pd-values smaller than 0.5 detected, indicating inconsistent direction
of the probability mass. This usually happens when the parameters space
is not continuous. Affected values are set to 1. See help('p_direction')
for more info.
Code
# Combine into single dataframesace_estimates=do.call(rbind, ace_estimates_list)ace_differences=do.call(rbind, ace_differences_list)# Add readable dataset labelsace_estimates$dataset_label=rq1y_twin_labels_clean[match(ace_estimates$dataset, rq1y_twin1)]ace_differences$dataset_label=rq1y_twin_labels_clean[match(ace_differences$dataset, rq1y_twin1)]
1 Difference = (Attrition + Imputed) − Original. Adjusted P values are Bonferroni-Holm corrected within each sex, dataset, and parameter (A, C or E) group
ACE Estimates Plot
Code
ace_estimates%>%mutate( name_clean =rq6y_labels[match(name, rq6y_prefix)], name_clean =factor(name_clean, levels =rq6y_labels), group =case_when(group=="df1"~"O",group=="df2"~"I",TRUE~group), group =factor(group, levels =c("I", "O")), sex =str_to_title(sex), par =toupper(par), dataset_label =factor(rq1y_twin_labels_clean_extrashort[match(dataset,rq1y_twin1)], levels =rq1y_twin_labels_clean_extrashort))%>%ggplot(aes(x =group, y =y, fill =par))+geom_col(position ="stack", alpha =1)+facet_grid(sex+dataset_label~name_clean, switch ="x")+theme_minimal()+theme( axis.text.x =element_text(angle =0, hjust =.5, size =8), strip.text =element_text(size =8), strip.text.x =element_text(angle =90, hjust =1, vjust =.5), strip.text.y =element_text(angle =0), strip.placement ="outside", panel.grid =element_blank(), plot.tag =element_text(hjust =0, vjust =0, size =24, face ="bold"), plot.tag.position ="topleft")+labs( title ="Attritioned + Imputed vs Original", x =NULL, y ="Proportion", fill ="Component", tag ="C")+scale_y_continuous(breaks =c(.25,.50,.75, 1), labels =c(".25",".50",".75","1.00"))+scale_fill_manual(values =c("A"="#d73027", "C"="#fee08b", "E"="#4575b4"))+coord_cartesian(ylim =c(0, 1))